Set the working dir
setwd("/mnt/picea/home/ishutava/giovanna")
Libs
suppressPackageStartupMessages(library(DEXSeq))
suppressPackageStartupMessages(library(VennDiagram))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(gplots))
Helpers
source("~/Git/UPSCb/src/R/volcanoPlot.R")
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DEXSeq':
##
## plotMA
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
#source("~/Git/UPSCb/src/R/plotMA.R")
Graphics
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(colors = c("blue","white","red"))(100)
Data
flattenedFile <- "/mnt/picea/storage/reference/Arabidopsis-thaliana/TAIR10/gff/TAIR10_GFF3_MY.gff"
samples <- read.csv("~/Git/UPSCb/projects/arabidopsis-porcupine-RNA-Seq/doc/RNA-seq_pcp_vs_Col-0_info.csv")
samples$genotype <- sub(" .*","",samples$Description)
#basename(flattenedFile)
Function
"run" <- function(inDir,metadata){
countFiles_23 = list.files(inDir, pattern=".txt$", full.names=TRUE)
#'basename(countFiles_23)
condition <- factor(samples[match(sub("_.*","",basename(countFiles_23)),
samples$SampleID),metadata])
sampleTable_23 = data.frame(
row.names = basename(countFiles_23),
condition = condition)
dxd_23 = DEXSeqDataSetFromHTSeq(
countFiles_23,
sampleData=sampleTable_23,
design= ~ sample + exon + condition:exon,
flattenedfile=flattenedFile )
dds_23 <- dxd_23
#------- Deleting rows with all 0's:
ts_23 <- counts(dds_23)
sel_23 <- !rowSums(ts_23 == 0) == ncol(ts_23)
dds_23 <- dds_23[sel_23,]
dxd_23 = estimateSizeFactors( dds_23)
# Dispersion estimation:
dxd_23 = estimateDispersions( dxd_23 )
plotDispEsts( dxd_23 )
# Testing for differential exon usage:
dxd_23 = testForDEU( dxd_23 )
dxd_23 = estimateExonFoldChanges( dxd_23, fitExpToVar="condition")
# Showing the results:
dxr3 = DEXSeqResults( dxd_23 ) #'#' 16 and 23 degrees for pcp's
names(dxr3)[names(dxr3)==grep("log2fold",colnames(dxr3))] <- "log2FoldChange"
# dxr2 = DEXSeqResults( dxd_23 ) #'#' 16 and 23 degrees for col's
# dxr1_23 = DEXSeqResults( dxd_23 ) #'#' 23 degrees WT and mutants
# mcols(dxr3)$description
#table ( dxr3$padj < 0.1 )
# plotMA( dxr3, cex=0.8 )
# plotDEXSeq( dxr3, "AT2G18740", legend=TRUE, cex.axis=1.2, cex=1.3,
# lwd=2 )
# plotDEXSeq( dxr3, "AT2G18740", displayTranscripts=TRUE, legend=TRUE,
# cex.axis=1.2, cex=1.3, lwd=2 )
# plotDEXSeq( dxr3, "AT2G18740", expression=FALSE, norCounts=TRUE,
# legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 )
#my_sel_pcp = matrix(c(dxr3[,1], dxr3[,8]/dxr3[,9], dxr3[,9]/dxr3[,8]), nrow = nrow(dxr3), ncol = 3)
#my_sel_col = matrix(c(dxr2[,1], dxr2[,8]/dxr2[,9], dxr2[,9]/dxr2[,8]), nrow = nrow(dxr2), ncol = 3)
#my_sel_23 = matrix(c(dxr1_23[,1], dxr1_23[,8]/dxr1_23[,9], dxr1_23[,9]/dxr1_23[,8]), nrow = nrow(dxr1_23), ncol = 3)
#volcanoPlot(dxr3)
return(dxr3)
}
## 16 Pcp vs. Col0
dxr1 <- run("16","genotype")
#volcanoPlot(dxr1)
#' ## 23 Pcp vs. Col0
dxr1_23 <- run("23","genotype")
#volcanoPlot(dxr1_23)
#' ## Col0 23 vs. 16
dxr2 <- run("col","Temperature.at.collection")
#volcanoPlot(dxr2)
#' ## PCP 23 vs. 16
dxr3 <- run("pcp","Temperature.at.collection")
#volcanoPlot(dxr3)
ol_16_23 <- VennDiagram::calculate.overlap(list(rownames(dxr1)[dxr1$padj < 0.01 &
abs(dxr1$log2FoldChange) >= 0.5 &
!is.na(dxr1$padj) &
!is.na(dxr1$log2FoldChange)],
rownames(dxr1_23)[dxr1_23$padj < 0.01 &
abs(dxr1_23$log2FoldChange) >= 0.5 &
!is.na(dxr1_23$padj) &
!is.na(dxr1_23$log2FoldChange)]))
ol_col_pcp <- VennDiagram::calculate.overlap(list(rownames(dxr2)[dxr2$padj < 0.01 &
abs(dxr2$log2FoldChange) >= 0.5 &
!is.na(dxr2$padj) &
!is.na(dxr2$log2FoldChange)],
rownames(dxr3)[dxr3$padj < 0.01 &
abs(dxr3$log2FoldChange) >= 0.5 &
!is.na(dxr3$padj) &
!is.na(dxr3$log2FoldChange)]))
grid.newpage()
draw.pairwise.venn(length(ol_16_23$a1), length(ol_16_23$a2), length(ol_16_23$a3),
c("16 degree", "23 degree"), fill = rainbow(2), alpha = 0.6)
## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], text[GRID.text.17], lines[GRID.lines.18], text[GRID.text.19], text[GRID.text.20])
grid.newpage()
draw.pairwise.venn(length(ol_col_pcp$a1), length(ol_col_pcp$a2), length(ol_col_pcp$a3),
c("col", "pcp"), fill = rainbow(2), alpha = 0.6)
## (polygon[GRID.polygon.21], polygon[GRID.polygon.22], polygon[GRID.polygon.23], polygon[GRID.polygon.24], text[GRID.text.25], text[GRID.text.26], lines[GRID.lines.27], text[GRID.text.28], lines[GRID.lines.29], text[GRID.text.30], text[GRID.text.31])
write.table(ol_16_23$a3, file = "/mnt/picea/home/ishutava/giovanna/table_16_23.csv", append = TRUE, quote = TRUE, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = TRUE,
col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
## Warning in write.table(ol_16_23$a3, file = "/mnt/picea/home/ishutava/
## giovanna/table_16_23.csv", : appending column names to file
write.table(ol_col_pcp$a3,file = "/mnt/picea/home/ishutava/giovanna/table_col_pcp.csv", append = TRUE, quote = TRUE, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = TRUE,
col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
## Warning in write.table(ol_col_pcp$a3, file = "/mnt/picea/home/ishutava/
## giovanna/table_col_pcp.csv", : appending column names to file
write.table(as.matrix(ol_16_23), file = "/mnt/picea/home/ishutava/giovanna/table_16_23_1.csv", append = TRUE, quote = TRUE, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = TRUE,
col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
## Warning in write.table(as.matrix(ol_16_23), file = "/mnt/picea/home/
## ishutava/giovanna/table_16_23_1.csv", : appending column names to file
write.table(as.matrix(ol_col_pcp),file = "/mnt/picea/home/ishutava/giovanna/table_col_pcp_1.csv", append = TRUE, quote = TRUE, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = TRUE,
col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
## Warning in write.table(as.matrix(ol_col_pcp), file = "/mnt/picea/home/
## ishutava/giovanna/table_col_pcp_1.csv", : appending column names to file
save(ol_16_23, ol_col_pcp, dxr1_23, dxr1, dxr2, dxr3, file="/mnt/picea/home/ishutava/giovanna/var.RData")
for (i in 1:length(ol_16_23$a3)){
plotDEXSeq( dxr1, substr(ol_16_23$a3[i],1,regexpr(':', ol_16_23$a3[i])-1), expression=FALSE, norCounts=TRUE,
legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 )
plotDEXSeq( dxr1_23, substr(ol_16_23$a3[i],1,regexpr(':', ol_16_23$a3[i])-1), expression=FALSE, norCounts=TRUE,
legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 )
}
for (i in 1:length(ol_col_pcp$a3)){
plotDEXSeq( dxr2, substr(ol_col_pcp$a3[i],1,regexpr(':', ol_col_pcp$a3[i])-1), expression=FALSE, norCounts=TRUE,
legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 )
plotDEXSeq( dxr3, substr(ol_col_pcp$a3[i],1,regexpr(':', ol_col_pcp$a3[i])-1), expression=FALSE, norCounts=TRUE,
legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 )
}
——– Maximum of the -log10(dxr1$padj ———– “AT1G74870:E001”
s <- !is.na(-log10(dxr1$padj))
rt1<- rownames(dxr1)[-log10(dxr1$padj[s]) == (max(-log10(dxr1$padj[s])))]
“AT2G31902:E004”
s_23 <- !is.na(-log10(dxr1_23$padj))
rt1_23 <- rownames(dxr1_23)[-log10(dxr1_23$padj[s_23]) == (max(-log10(dxr1_23$padj[s_23])))]
tx16 <- dxr1$transcripts[
dxr1$padj < 0.01 &
abs(dxr1$log2FoldChange) >= 0.5 &
!is.na(dxr1$padj)]
tx23 <- dxr1_23$transcripts[
dxr1_23$padj < 0.01 &
abs(dxr1_23$log2FoldChange) >= 0.5 &
!is.na(dxr1_23$padj)]
txcol <- dxr2$transcripts[
dxr2$padj < 0.01 &
abs(dxr2$log2FoldChange) >= 0.5 &
!is.na(dxr2$padj)]
txpcp <- dxr3$transcripts[
dxr3$padj < 0.01 &
abs(dxr3$log2FoldChange) >= 0.5 &
!is.na(dxr3$padj)]
plot.new()
grid.draw(venn.diagram(list(
t16=tx16,t23=tx23,col=txcol,pcp=txpcp),
filename=NULL,
col=pal[1:4],
category.names=c("t16","t23","col","pcp")))
Define the cutoffs
padj <- 0.01
lfc <- 0.5
extract the data
dl <- lapply(list(dxr1,dxr1_23,dxr2,dxr3),function(d){
d <- d[d$padj < 0.01 &
abs(d$log2FoldChange) >= 0.5 &
!is.na(d$padj) &
!is.na(d$log2FoldChange),]
})
verify
plot.new()
grid.draw(venn.diagram(lapply(dl,rownames),
filename=NULL,
col=pal[1:4],
category.names=c("t16","t23","col","pcp")))
extract the exon of interest
eoi <- unique(unlist(lapply(dl,rownames)))
sub select
ds <- lapply(list(dxr1,dxr1_23,dxr2,dxr3), function(d,eoi){
d[match(eoi,rownames(d)),c(8,9)]
},eoi)
Combine into a df
df <- as.data.frame(do.call(cbind,ds))
colnames(df) <- c("Col0.16","pcp.16",
"Col0.23","pcp.23",
"16.Col0","23.Col0",
"16.pcp","23.pcp"
)
replace NAs
df[is.na(df)] <- 0
PCA
pc <- prcomp(t(df))
percentage <- round(summary(pc)$importance[2,]*100,digits=2)
plot(pc$x[,1],pc$x[,2],col=rep(pal[1:4],each=2),pch=rep(c(25,24),4),main="PCA of the exon usage",
xlab=paste("PC1",percentage[1],"%"),
ylab=paste("PC2",percentage[2],"%"))
text(pc$x[1,1],pc$x[1,2],colnames(df)[1],adj = c(-0.2,0),col=rep(pal[1:4],each=2)[1])
text(pc$x[c(2,4,5,7,8),1],pc$x[c(2,4,5,7,8),2],colnames(df)[c(2,4,5,7,8)],adj = c(-0.2,-1),col=rep(pal[1:4],each=2)[c(2,4,5,7,8)])
text(pc$x[c(3,6),1],pc$x[c(3,6),2],colnames(df)[c(3,6)],adj = c(1.2,1),col=rep(pal[1:4],each=2)[c(3,6)])
pc <- prcomp(df)
percentage <- round(summary(pc)$importance[2,]*100,digits=2)
mat <- sapply(
lapply(lapply(dl,rownames),
function(d){eoi %in% d}),as.integer)
fac <- factor(apply(mat,
1,paste0,collapse=""))
fc <- sapply(seq(2,8,2),function(i){sign(df[,i]-df[,i-1])})
cex <- rep(24,length(fac))
cex[fac=="1000"][fc[fac=="1000",1] == -1] <- 25
cex[fac=="0100"][fc[fac=="0100",2] == -1] <- 25
cex[fac=="0010"][fc[fac=="0010",3] == -1] <- 25
cex[fac=="0001"][fc[fac=="0001",4] == -1] <- 25
plot(pc$x[,1],pc$x[,2],
col=pal[as.integer(fac)],
pch=cex)
col <- rep(0,length(fac))
col[fac=="1000"][fc[fac=="1000",1] == -1] <- pal[2]
col[fac=="1000"][fc[fac=="1000",1] == 1] <- pal[1]
col[fac=="0100"][fc[fac=="0100",2] == -1] <- pal[4]
col[fac=="0100"][fc[fac=="0100",2] == 1] <- pal[3]
col[fac=="0010"][fc[fac=="0010",3] == -1] <- pal[6]
col[fac=="0010"][fc[fac=="0010",3] == 1] <- pal[5]
col[fac=="0001"][fc[fac=="0001",4] == -1] <- pal[8]
col[fac=="0001"][fc[fac=="0001",4] == 1] <- pal[7]
plot(pc$x[,1],pc$x[,2],
col=col,
pch=cex)
legend("topright",colnames(df),col=pal,bty="n",pch=rep(c(24,25),4))
sels <- apply(mat,2,"==",1)
sapply(1:4,function(i){
sel <- sels[,i]
plot(pc$x[,1][sel],pc$x[,2][sel],
col=ifelse(fc[sel,i]==-1,pal[i*2],pal[(i*2)-1]),
pch=ifelse(fc[sel,i]==-1,24,25),
xlim=range(pc$x[,1]),
ylim=range(pc$x[,2]))
legend("topright",colnames(df),col=pal,bty="n",pch=rep(c(25,24),4))
})
## [,1] [,2] [,3] [,4]
## rect List,4 List,4 List,4 List,4
## text List,2 List,2 List,2 List,2
Fold changes
dfc <- sapply(list(dxr1,dxr1_23,dxr2,dxr3), function(d,eoi){
d[match(eoi,rownames(d)),"log2FoldChange"]
},eoi)
colnames(dfc) <- c("pcp.vs.Col0.16",
"pcp.vs.Col0.23",
"23.vs.16.Col0",
"23.vs.16.pcp"
)
replace NAs
dfc[is.na(dfc)] <- 0
PCA
pc <- prcomp(t(dfc))
percentage <- round(summary(pc)$importance[2,]*100,digits=2)
plot(pc$x[,1],pc$x[,2],col=rep(pal[1:4]),
main="PCA of the exon usage",
xlab=paste("PC1",percentage[1],"%"),
ylab=paste("PC2",percentage[2],"%"))
legend("center",colnames(dfc),col=pal[1:4],pch=18)
Heatmap
heatmap.2(as.matrix(df),scale="row",col=hpal,trace="none",
labRow=FALSE,cexCol = 1)
heatmap.2(as.matrix(df[,5:8]),scale="row",col=hpal,trace="none",
labRow=FALSE,cexCol = 1)
heatmap.2(as.matrix(df[,1:4]),scale="row",col=hpal,trace="none",
labRow=FALSE,cexCol = 1)
Check the vst
vsd <- read.csv("/mnt/picea/projects/arabidopsis/mschmid/porcupine-RNA-Seq/analysis/HTSeq/library-size-normalized_variance-stabilized_data.csv",
row.names = 1)
samples <- read.csv("~/Git/UPSCb/projects/arabidopsis-porcupine-RNA-Seq/doc/RNA-seq_pcp_vs_Col-0_info.csv")
samples$label <- paste(sub(" .*","",samples$Description),samples$Temperature.at.collection)
dat <- as.matrix(na.omit(vsd[unique(unlist(strsplit(sub(":E[0-9]+","",rownames(df)),"\\+"))),]))
heatmap.2(dat,scale="row",col=hpal,trace="none",
labRow=FALSE,cexCol = 1,labCol = samples[match(sub("X","",colnames(vsd)),samples$SampleID),"label"])
#dat[dat < 1] <- 1
dat[dat > 7 ] <- 7
heatmap.2(dat,
col=hpal,trace="none",
labRow=FALSE,cexCol = 1,labCol = samples[match(sub("X","",colnames(vsd)),samples$SampleID),"label"])
Last check
nams <- intersect(rownames(dxr2),rownames(dxr3))
library(LSD)
heatscatter(dxr2[match(nams,rownames(dxr2)),4],dxr3[match(nams,rownames(dxr3)),4])
heatscatter(dxr2[match(nams,rownames(dxr2)),8],dxr3[match(nams,rownames(dxr3)),8],
xlab="Col0 23 vs. 16",main="exon expression @ 16",
ylab="pcp 23 vs. 16")
abline(1,1,col="grey",lwd=2,lty=2)
heatscatter(dxr2[match(nams,rownames(dxr2)),9],dxr3[match(nams,rownames(dxr3)),9],
xlab="Col0 23 vs. 16",main="exon expression @ 23",
ylab="pcp 23 vs. 16")
abline(1,1,col="grey",lwd=2,lty=2)
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] LSD_3.0 limma_3.32.2
## [3] gplots_3.0.1 VennDiagram_1.6.17
## [5] futile.logger_1.4.3 DEXSeq_1.22.0
## [7] RColorBrewer_1.1-2 AnnotationDbi_1.38.1
## [9] DESeq2_1.16.1 SummarizedExperiment_1.6.3
## [11] DelayedArray_0.2.7 matrixStats_0.52.2
## [13] GenomicRanges_1.28.3 GenomeInfoDb_1.12.2
## [15] IRanges_2.10.2 S4Vectors_0.14.3
## [17] Biobase_2.36.2 BiocGenerics_0.22.0
## [19] BiocParallel_1.10.1
##
## loaded via a namespace (and not attached):
## [1] bit64_0.9-7 splines_3.4.0
## [3] gtools_3.5.0 Formula_1.2-1
## [5] statmod_1.4.30 latticeExtra_0.6-28
## [7] blob_1.1.0 GenomeInfoDbData_0.99.0
## [9] Rsamtools_1.28.0 yaml_2.1.14
## [11] RSQLite_2.0 backports_1.1.0
## [13] lattice_0.20-35 digest_0.6.12
## [15] XVector_0.16.0 checkmate_1.8.2
## [17] colorspace_1.3-2 htmltools_0.3.6
## [19] Matrix_1.2-10 plyr_1.8.4
## [21] XML_3.98-1.9 biomaRt_2.32.1
## [23] genefilter_1.58.1 zlibbioc_1.22.0
## [25] xtable_1.8-2 scales_0.4.1
## [27] gdata_2.18.0 htmlTable_1.9
## [29] tibble_1.3.3 annotate_1.54.0
## [31] ggplot2_2.2.1 nnet_7.3-12
## [33] lazyeval_0.2.0 survival_2.41-3
## [35] magrittr_1.5 memoise_1.1.0
## [37] evaluate_0.10 hwriter_1.3.2
## [39] foreign_0.8-68 tools_3.4.0
## [41] data.table_1.10.4 stringr_1.2.0
## [43] munsell_0.4.3 locfit_1.5-9.1
## [45] cluster_2.0.6 lambda.r_1.1.9
## [47] Biostrings_2.44.1 compiler_3.4.0
## [49] caTools_1.17.1 rlang_0.1.1
## [51] RCurl_1.95-4.8 htmlwidgets_0.8
## [53] bitops_1.0-6 base64enc_0.1-3
## [55] rmarkdown_1.6 gtable_0.2.0
## [57] DBI_0.7 gridExtra_2.2.1
## [59] knitr_1.16 bit_1.1-12
## [61] Hmisc_4.0-3 rprojroot_1.2
## [63] futile.options_1.0.0 KernSmooth_2.23-15
## [65] stringi_1.1.5 Rcpp_0.12.11
## [67] geneplotter_1.54.0 rpart_4.1-11
## [69] acepack_1.4.1